Inspect dsrA tree

First I read in the tree. Since some DsrB sequences made their way into this tree, we can midpoint root the tree and get a decent result from that:

dsrA_tree <- read.newick(file = "muller_DsrA_dataset_final.tree")
dsrA_tree_rooted <- phangorn::midpoint(dsrA_tree)

Then I read in the metadata and generated a new name for each of the entries that includes the classification of the DsrA protein and the class of the organism.

dsrA_metadata <- read.table("muller_DsrA_dataset_final_metadata.tsv",
                            sep = '\t',
                            header = TRUE)
metadata_vector <- paste(dsrA_metadata$dsrA_type, ":",
                         gsub(pattern = " ", "", dsrA_metadata$class), "_",
                         dsrA_metadata$seqID,
                         sep = "")
names(metadata_vector) <- dsrA_metadata$seqID

Generate the initial tree:

dsrA_tree_rooted$tip.label <- metadata_vector[dsrA_tree_rooted$tip.label]
ggtree(dsrA_tree_rooted)  +
  geom_tiplab(size = 2) +
  geom_text2(aes(subset = !isTip,
                 label = node),
             size = 2,
             hjust = 1,
             vjust = -1.5) +
  geom_treescale(x = 9)

Looks like there are some dsrB genes in here maybe? Let’s remove those, but keep three in the final dataset for an outgroup.

dsrA_only_tree <- tree_subset(tree = dsrA_tree_rooted,
                              node = 870,
                              levels_back = 0)
  
ggtree(dsrA_only_tree)  +
  geom_tiplab(size = 2) +
  geom_text2(aes(subset = !isTip, label = node),
             size = 1.5) +
  geom_treescale(x = 9)

dsrB_seqs_to_keep <- c("oxi_DsrA:Alphaproteobacteria_JQ256776")
seqs_to_keep <- c(dsrA_only_tree$tip.label)
DsrA_faa <- readAAStringSet("muller_DsrA_dataset_final.faa")
names(DsrA_faa) <- metadata_vector[names(DsrA_faa)]